###################
#load libraries
###################
library(dplyr)
##
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
##
## intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
library(patchwork)
library(ggplot2)
Setup the Seurat Object
############################
#Setup the Seurat Object
############################
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "./data_new/002/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat
## 22302 features across 11922 samples within 1 assay
## Active assay: RNA (22302 features, 0 variable features)
Standard pre-processing workflow
QC and selecting cells for further analysis
- Visualize QC metrics as a violin plot
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

- FeatureScatter is typically used to visualize feature-feature relationships, but can be used for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

pbmc <- subset(pbmc, subset = nFeature_RNA > 1000 & nFeature_RNA < 4000 & percent.mt < 15)
Normalizing the data
#pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- NormalizeData(pbmc)
Identification of highly variable features (feature selection)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6 rows containing missing values (geom_point).

Scaling the data
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
Perform linear dimensional reduction
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1
## Positive: LTB, CD3E, IL32, CD3D, TRAC, PCED1B-AS1, CD3G, TRBC2, ETS1, MALAT1
## BCL11B, IL7R, ARL4C, CD2, LIME1, TCF7, CD7, LINC00861, PRKCQ-AS1, CD27
## FCMR, CCR7, PIK3IP1, CD247, TRBC1, GZMM, LEF1, NOSIP, ISG20, BCL2
## Negative: FGL2, FCN1, LYZ, CST3, MNDA, CTSS, TYMP, CYBB, SERPINA1, IFI30
## PSAP, NCF2, S100A9, LST1, TYROBP, AIF1, CSTA, MPEG1, TNFSF13B, TMEM176B
## FTL, DUSP6, CD68, MS4A6A, S100A8, DUSP1, FOS, GRN, AC020656.1, CD36
## PC_ 2
## Positive: MS4A1, CD79A, IGHM, BANK1, CD79B, LINC00926, RALGPS2, TNFRSF13C, IGHD, NIBAN3
## CD22, HLA-DQA1, IGKC, BCL11A, AFF3, LINC02397, SPIB, PAX5, VPREB3, FCRL1
## BLK, FCER2, CD74, HLA-DRA, SWAP70, HLA-DRB1, COBLL1, HLA-DOB, BLNK, P2RX5
## Negative: IL32, CD3E, GZMM, CD3D, CTSW, CD2, CD247, ANXA1, CD3G, CD7
## GZMA, BCL11B, S100A4, NKG7, CST7, TRAC, PRF1, LINC00861, TMSB4X, GNLY
## CCL5, ARL4C, KLRK1, KLRD1, S100A10, SAMD3, IL7R, ID2, FGFBP2, TRBC1
## PC_ 3
## Positive: IL7R, CCR7, LEF1, TCF7, MAL, RCAN3, TRABD2A, PIK3IP1, NOSIP, CD27
## OXNAD1, CAMK4, LTB, PRKCQ-AS1, TRAC, PDE3B, CHRM3-AS2, BCL11B, FHIT, LRRN3
## VIM, RGCC, NELL2, PASK, CD3G, CD3D, TRAT1, AQP3, ABLIM1, S100A12
## Negative: NKG7, GNLY, GZMB, KLRD1, CST7, PRF1, FGFBP2, GZMA, KLRF1, CCL4
## HOPX, SPON2, CLIC3, GZMH, CCL5, ADGRG1, FCGR3A, PTGDR, CD160, TBX21
## XCL2, MATK, IL2RB, C12orf75, TRDC, LAIR2, CTSW, PLEK, S1PR5, FCRL6
## PC_ 4
## Positive: CDKN1C, HES4, TCF7L2, FCGR3A, CTSL, BATF3, MS4A7, SIGLEC10, CSF1R, CKB
## MTSS1, CASP5, IFITM3, AC104809.2, RRAS, CALML4, SMIM25, MS4A4A, ABI3, AC064805.1
## RHOC, CAMK1, NEURL1, CALHM6, NAP1L1, GPBAR1, HMOX1, HCAR3, ZNF703, LY6E
## Negative: S100A12, SLC2A3, VCAN, PADI4, CYP1B1, ALOX5AP, ITGAM, CES1, MGST1, S100A8
## RNASE6, MEGF9, METTL9, QPCT, MCEMP1, VNN2, THBS1, PGD, CTSD, RBP7
## CD14, RNASE2, BST1, AC020916.1, F5, PLBD1, MARC1, CD36, CKAP4, CRISPLD2
## PC_ 5
## Positive: LILRA4, CLEC4C, SERPINF1, LRRC26, TPM2, SCT, DNASE1L3, MAP1A, PLD4, IL3RA
## LINC00996, EPHB1, PTCRA, ITM2C, LAMP5, SMPD3, PACSIN1, CIB2, TNFRSF21, SCAMP5
## PHEX, DERL3, GAS6, PLEKHD1, SCN9A, AC097375.1, ZFAT, SMIM5, AL096865.1, NRP1
## Negative: MS4A1, LINC00926, CD79A, BANK1, TNFRSF13C, IGHD, CD79B, CD22, FCRL1, RALGPS2
## PAX5, VPREB3, FCRL3, GNLY, FCER2, KLRD1, LINC02397, CCL4, FGFBP2, PRF1
## CST7, HLA-DOB, NKG7, HOPX, P2RX5, ADAM28, GZMA, KLRF1, SWAP70, GZMH
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: LTB, CD3E, IL32, CD3D, TRAC
## Negative: FGL2, FCN1, LYZ, CST3, MNDA
## PC_ 2
## Positive: MS4A1, CD79A, IGHM, BANK1, CD79B
## Negative: IL32, CD3E, GZMM, CD3D, CTSW
## PC_ 3
## Positive: IL7R, CCR7, LEF1, TCF7, MAL
## Negative: NKG7, GNLY, GZMB, KLRD1, CST7
## PC_ 4
## Positive: CDKN1C, HES4, TCF7L2, FCGR3A, CTSL
## Negative: S100A12, SLC2A3, VCAN, PADI4, CYP1B1
## PC_ 5
## Positive: LILRA4, CLEC4C, SERPINF1, LRRC26, TPM2
## Negative: MS4A1, LINC00926, CD79A, BANK1, TNFRSF13C
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset
# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
pbmc <- JackStraw(pbmc, num.replicate = 100, dims = 50)
pbmc <- ScoreJackStraw(pbmc, dims = 1:50)
JackStrawPlot(pbmc, dims = 1:40)
## Warning: Removed 61657 rows containing missing values (geom_point).

ElbowPlot(pbmc, ndims = 50)

Cluster the cells
pbmc <- FindNeighbors(pbmc, dims = 1:36)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 10877
## Number of edges: 427932
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9139
## Number of communities: 17
## Elapsed time: 2 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
## AAACCCAAGGCCCAAA-1 AAACCCAAGTAATACG-1 AAACCCAAGTCACACT-1 AAACCCACAAAGCGTG-1
## 7 0 0 1
## AAACCCACAATCGAAA-1
## 6
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Run non-linear dimensional reduction (UMAP/tSNE)
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
pbmc <- RunUMAP(pbmc, dims = 1:36)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 20:39:29 UMAP embedding parameters a = 0.9922 b = 1.112
## 20:39:29 Read 10877 rows and found 36 numeric columns
## 20:39:29 Using Annoy for neighbor search, n_neighbors = 30
## 20:39:29 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:39:31 Writing NN index file to temp file C:\Users\Daniel\AppData\Local\Temp\RtmpuMS6TS\file3f8413750d
## 20:39:31 Searching Annoy index using 1 thread, search_k = 3000
## 20:39:34 Annoy recall = 100%
## 20:39:34 Commencing smooth kNN distance calibration using 1 thread
## 20:39:35 Initializing from normalized Laplacian + noise
## 20:39:35 Commencing optimization for 200 epochs, with 470104 positive edges
## 20:39:47 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc, reduction = "umap")

#UMAPPlot(pbmc)
#saveRDS(pbmc, file = "./pbmc_tutorial.rds")
#run TSNE
pbmc <- RunTSNE(pbmc, dims = 1:36)
TSNEPlot(pbmc)

Finding differentially expressed features (cluster biomarkers)
- find all markers of cluster 2
# find all markers of cluster 2
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
- find all markers distinguishing cluster 5 from clusters 0 and 3
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
- find markers for every cluster compared to all remaining cells, report only the positive ones
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
pbmc.markers %>%
group_by(cluster) %>%
slice_max(n = 2, order_by = avg_log2FC) -> top2
top_gene <- top2$gene[seq(1, nrow(top2), 2)]
top2
#cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(pbmc, features = top_gene, ncol = 3)

# you can plot raw counts as well
VlnPlot(pbmc, features = top_gene, slot = "counts", log = TRUE, ncol = 3)

FeaturePlot(pbmc, features = top_gene, ncol = 3)

FeaturePlot(pbmc, features = top_gene, reduction = "tsne", ncol = 3)

pbmc.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

Assigning cell type identity to clusters
# new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono",
# "NK", "DC", "Platelet")
#names(new.cluster.ids) <- levels(pbmc)
#pbmc <- RenameIdents(pbmc, new.cluster.ids)
#DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()